Group Project

Import Packages

library("ggplot2")
library('dplyr')
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library('tidyverse')
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble  3.0.4     v purrr   0.3.4
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library('geosphere')
library("ggmap")
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.

Importing Data

# Reading in the sample CSV of rider data we made
rider_2019_sample <- read.csv('sample.csv', stringsAsFactors = TRUE)
head(rider_2019_sample)
##   tripduration                starttime                 stoptime
## 1          564 2019-04-11 07:44:36.0580 2019-04-11 07:54:00.1840
## 2         1158 2019-05-15 18:00:33.3890 2019-05-15 18:19:52.0150
## 3          763 2019-03-25 13:27:50.4260 2019-03-25 13:40:33.7960
## 4          915 2019-06-21 15:52:07.8340 2019-06-21 16:07:23.6810
## 5         1368 2019-06-01 18:42:22.9500 2019-06-01 19:05:11.7510
## 6          267 2019-07-31 18:47:05.5630 2019-07-31 18:51:33.0870
##   start.station.id       start.station.name start.station.latitude
## 1             3711       E 13 St & Avenue A               40.72967
## 2             3016        Kent Ave & N 7 St               40.72037
## 3              382  University Pl & E 14 St               40.73493
## 4              359       E 47 St & Park Ave               40.75510
## 5             3295 Central Park W & W 96 St               40.79127
## 6             3377     Carroll St & Bond St               40.67861
##   start.station.longitude end.station.id  end.station.name end.station.latitude
## 1               -73.98068            168   W 18 St & 6 Ave             40.73971
## 2               -73.96165           3016 Kent Ave & N 7 St             40.72037
## 3               -73.99201            459  W 20 St & 11 Ave             40.74674
## 4               -73.97499            483   E 12 St & 3 Ave             40.73223
## 5               -73.96484           3142   1 Ave & E 62 St             40.76123
## 6               -73.99037           3398   Smith St & 9 St             40.67470
##   end.station.longitude bikeid   usertype birth.year gender
## 1             -73.99456  29807 Subscriber       1994      1
## 2             -73.96165  34411 Subscriber       1974      1
## 3             -74.00776  16078 Subscriber       1961      1
## 4             -73.98890  29904 Subscriber       1964      2
## 5             -73.96094  30247   Customer       1969      0
## 6             -73.99786  20315 Subscriber       1971      1
# Reading in the weather data set
weather_data <- read.csv('NYCWeather2019.csv', stringsAsFactors = TRUE)
head(weather_data)
##       STATION                        NAME     DATE AWND PRCP SNOW SNWD TAVG
## 1 USW00094728 NY CITY CENTRAL PARK, NY US 1/1/2019   NA 0.06    0    0   NA
## 2 USW00094728 NY CITY CENTRAL PARK, NY US 1/2/2019   NA 0.00    0    0   NA
## 3 USW00094728 NY CITY CENTRAL PARK, NY US 1/3/2019   NA 0.00    0    0   NA
## 4 USW00094728 NY CITY CENTRAL PARK, NY US 1/4/2019   NA 0.00    0    0   NA
## 5 USW00094728 NY CITY CENTRAL PARK, NY US 1/5/2019   NA 0.50    0    0   NA
## 6 USW00094728 NY CITY CENTRAL PARK, NY US 1/6/2019   NA 0.00    0    0   NA
##   TMAX TMIN
## 1   58   39
## 2   40   35
## 3   44   37
## 4   47   35
## 5   47   41
## 6   49   31

Initial Data Summary

# Initial summary of rider data set
str(rider_2019_sample)
## 'data.frame':    100000 obs. of  15 variables:
##  $ tripduration           : int  564 1158 763 915 1368 267 661 1112 520 512 ...
##  $ starttime              : Factor w/ 99999 levels "2019-01-01 00:56:30.7720",..: 18803 28405 14066 41002 34169 54789 95279 5247 68397 75686 ...
##  $ stoptime               : Factor w/ 100000 levels "2019-01-01 01:34:45.0200",..: 18804 28409 14065 41001 34174 54787 95282 5246 68395 75682 ...
##  $ start.station.id       : Factor w/ 825 levels "116","119","120",..: 621 86 688 538 263 348 749 80 259 545 ...
##  $ start.station.name     : Factor w/ 894 levels "1 Ave & E 110 St",..: 352 545 760 386 250 234 797 672 440 99 ...
##  $ start.station.latitude : num  40.7 40.7 40.7 40.8 40.8 ...
##  $ start.station.longitude: num  -74 -74 -74 -74 -74 ...
##  $ end.station.id         : Factor w/ 828 levels "116","119","120",..: 15 86 752 774 184 369 623 27 333 509 ...
##  $ end.station.name       : Factor w/ 890 levels "1 Ave & E 110 St",..: 793 549 795 350 7 714 787 371 598 92 ...
##  $ end.station.latitude   : num  40.7 40.7 40.7 40.7 40.8 ...
##  $ end.station.longitude  : num  -74 -74 -74 -74 -74 ...
##  $ bikeid                 : int  29807 34411 16078 29904 30247 20315 40128 33989 29972 20897 ...
##  $ usertype               : Factor w/ 2 levels "Customer","Subscriber": 2 2 2 2 1 2 1 2 2 2 ...
##  $ birth.year             : int  1994 1974 1961 1964 1969 1971 1969 1960 1972 1966 ...
##  $ gender                 : int  1 1 1 2 0 1 0 1 1 1 ...
summary(rider_2019_sample)
##   tripduration                          starttime    
##  Min.   :     61.0   2019-11-22 17:59:58.4760:    2  
##  1st Qu.:    362.0   2019-01-01 00:56:30.7720:    1  
##  Median :    614.0   2019-01-01 01:35:30.5010:    1  
##  Mean   :    950.8   2019-01-01 02:04:41.7180:    1  
##  3rd Qu.:   1075.0   2019-01-01 02:25:28.9700:    1  
##  Max.   :2769536.0   2019-01-01 02:33:50.6550:    1  
##                      (Other)                 :99993  
##                      stoptime     start.station.id
##  2019-01-01 01:34:45.0200:    1   519    :  810   
##  2019-01-01 01:51:55.8730:    1   3255   :  617   
##  2019-01-01 02:13:13.4810:    1   497    :  602   
##  2019-01-01 02:29:13.1090:    1   402    :  561   
##  2019-01-01 03:04:23.8640:    1   435    :  551   
##  2019-01-01 04:09:48.6020:    1   (Other):96523   
##  (Other)                 :99994   NA's   :  336   
##              start.station.name start.station.latitude start.station.longitude
##  Pershing Square North:  810    Min.   :40.66          Min.   :-74.03         
##  8 Ave & W 31 St      :  617    1st Qu.:40.72          1st Qu.:-74.00         
##  E 17 St & Broadway   :  602    Median :40.74          Median :-73.98         
##  Broadway & E 22 St   :  561    Mean   :40.74          Mean   :-73.98         
##  W 21 St & 6 Ave      :  551    3rd Qu.:40.76          3rd Qu.:-73.97         
##  Broadway & E 14 St   :  548    Max.   :40.85          Max.   :-73.88         
##  (Other)              :96311                                                  
##  end.station.id               end.station.name end.station.latitude
##  519    :  792   Pershing Square North:  792   Min.   :40.66       
##  402    :  636   Broadway & E 22 St   :  636   1st Qu.:40.72       
##  3255   :  632   8 Ave & W 31 St      :  632   Median :40.74       
##  497    :  623   E 17 St & Broadway   :  623   Mean   :40.74       
##  285    :  547   Broadway & E 14 St   :  547   3rd Qu.:40.76       
##  (Other):96426   W 21 St & 6 Ave      :  544   Max.   :40.86       
##  NA's   :  344   (Other)              :96226                       
##  end.station.longitude     bikeid            usertype       birth.year  
##  Min.   :-74.03        Min.   :14529   Customer  :14054   Min.   :1885  
##  1st Qu.:-74.00        1st Qu.:25346   Subscriber:85946   1st Qu.:1970  
##  Median :-73.99        Median :30918                      Median :1983  
##  Mean   :-73.98        Mean   :29674                      Mean   :1980  
##  3rd Qu.:-73.97        3rd Qu.:35049                      3rd Qu.:1990  
##  Max.   :-73.89        Max.   :42046                      Max.   :2003  
##                                                                         
##      gender     
##  Min.   :0.000  
##  1st Qu.:1.000  
##  Median :1.000  
##  Mean   :1.161  
##  3rd Qu.:1.000  
##  Max.   :2.000  
## 
# Initial summart of weather data set
str(weather_data)
## 'data.frame':    365 obs. of  10 variables:
##  $ STATION: Factor w/ 1 level "USW00094728": 1 1 1 1 1 1 1 1 1 1 ...
##  $ NAME   : Factor w/ 1 level "NY CITY CENTRAL PARK, NY US": 1 1 1 1 1 1 1 1 1 1 ...
##  $ DATE   : Factor w/ 365 levels "1/1/2019","1/10/2019",..: 1 12 23 26 27 28 29 30 31 2 ...
##  $ AWND   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ PRCP   : num  0.06 0 0 0 0.5 0 0 0.17 0.06 0 ...
##  $ SNOW   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SNWD   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ TAVG   : logi  NA NA NA NA NA NA ...
##  $ TMAX   : int  58 40 44 47 47 49 34 45 45 34 ...
##  $ TMIN   : int  39 35 37 35 41 31 25 34 34 28 ...
summary(rider_2019_sample)
##   tripduration                          starttime    
##  Min.   :     61.0   2019-11-22 17:59:58.4760:    2  
##  1st Qu.:    362.0   2019-01-01 00:56:30.7720:    1  
##  Median :    614.0   2019-01-01 01:35:30.5010:    1  
##  Mean   :    950.8   2019-01-01 02:04:41.7180:    1  
##  3rd Qu.:   1075.0   2019-01-01 02:25:28.9700:    1  
##  Max.   :2769536.0   2019-01-01 02:33:50.6550:    1  
##                      (Other)                 :99993  
##                      stoptime     start.station.id
##  2019-01-01 01:34:45.0200:    1   519    :  810   
##  2019-01-01 01:51:55.8730:    1   3255   :  617   
##  2019-01-01 02:13:13.4810:    1   497    :  602   
##  2019-01-01 02:29:13.1090:    1   402    :  561   
##  2019-01-01 03:04:23.8640:    1   435    :  551   
##  2019-01-01 04:09:48.6020:    1   (Other):96523   
##  (Other)                 :99994   NA's   :  336   
##              start.station.name start.station.latitude start.station.longitude
##  Pershing Square North:  810    Min.   :40.66          Min.   :-74.03         
##  8 Ave & W 31 St      :  617    1st Qu.:40.72          1st Qu.:-74.00         
##  E 17 St & Broadway   :  602    Median :40.74          Median :-73.98         
##  Broadway & E 22 St   :  561    Mean   :40.74          Mean   :-73.98         
##  W 21 St & 6 Ave      :  551    3rd Qu.:40.76          3rd Qu.:-73.97         
##  Broadway & E 14 St   :  548    Max.   :40.85          Max.   :-73.88         
##  (Other)              :96311                                                  
##  end.station.id               end.station.name end.station.latitude
##  519    :  792   Pershing Square North:  792   Min.   :40.66       
##  402    :  636   Broadway & E 22 St   :  636   1st Qu.:40.72       
##  3255   :  632   8 Ave & W 31 St      :  632   Median :40.74       
##  497    :  623   E 17 St & Broadway   :  623   Mean   :40.74       
##  285    :  547   Broadway & E 14 St   :  547   3rd Qu.:40.76       
##  (Other):96426   W 21 St & 6 Ave      :  544   Max.   :40.86       
##  NA's   :  344   (Other)              :96226                       
##  end.station.longitude     bikeid            usertype       birth.year  
##  Min.   :-74.03        Min.   :14529   Customer  :14054   Min.   :1885  
##  1st Qu.:-74.00        1st Qu.:25346   Subscriber:85946   1st Qu.:1970  
##  Median :-73.99        Median :30918                      Median :1983  
##  Mean   :-73.98        Mean   :29674                      Mean   :1980  
##  3rd Qu.:-73.97        3rd Qu.:35049                      3rd Qu.:1990  
##  Max.   :-73.89        Max.   :42046                      Max.   :2003  
##                                                                         
##      gender     
##  Min.   :0.000  
##  1st Qu.:1.000  
##  Median :1.000  
##  Mean   :1.161  
##  3rd Qu.:1.000  
##  Max.   :2.000  
## 

Initial Data Analysis

Rider Age

rider_2019_sample$age <- 2019 - as.numeric(as.character(rider_2019_sample$birth.year))
rider_2019_sample <- filter(rider_2019_sample, age <= 80)

Gender Split in Riders

# Reclassifying the genders
# 0=unknown, 1=male, 2=female
rider_2019_sample$gender <- ifelse(rider_2019_sample$gender == 0, "Unkown",
                                  ifelse(rider_2019_sample$gender == 1, "Male", "Female"))

# Seeing the split of genders who rented bikes
rider_2019_sample %>%
  ggplot(aes(x=gender)) +
  geom_bar()

Subscriber vs Customer for Riders

# Seeing the split of user type who rented bikes
rider_2019_sample %>%
  ggplot(aes(x=usertype)) +
  geom_bar()

Trip Duration

# Range of all bike rides
rider_2019_sample <- filter(rider_2019_sample, tripduration <= 3000)
duration_range <- range(rider_2019_sample$tripduration, na.rm=TRUE)
duration_range
## [1]   61 3000
# Average length of a bike ride
duration_mean <- mean(rider_2019_sample$tripduration, na.rm=TRUE)
duration_mean
## [1] 776.546
# Standard deviation of bike rides
duration_sd <- sd(rider_2019_sample$tripduration, na.rm=TRUE)
duration_sd
## [1] 561.1764

Adjusting Dates in Data Sets

# Creating columns of just month, day, and year
weather_data$DATE <- as.Date(weather_data$DATE, format = "%m/%d/%Y")
weather_data$Month <- format(weather_data$DATE, "%m")
weather_data$Day <- format(weather_data$DATE, "%d")
weather_data$Year <- format(weather_data$DATE, "%Y")
# Creating columns of just month, day, and year
rider_2019_sample$DATE <- as.Date(rider_2019_sample$starttime, format = "%Y-%m-%d")
rider_2019_sample$Month <- format(rider_2019_sample$DATE, "%m")
rider_2019_sample$Day <- format(rider_2019_sample$DATE, "%d")
rider_2019_sample$Year <- format(rider_2019_sample$DATE, "%Y")

Types of Weather per Month

# Average precipitation per month
weather_data %>% 
  summarise(average_precip = tapply(PRCP, Month, mean, na.rm=TRUE))
##    average_precip
## 1      0.11548387
## 2      0.11214286
## 3      0.12483871
## 4      0.15166667
## 5      0.22000000
## 6      0.18200000
## 7      0.18612903
## 8      0.11935484
## 9      0.03166667
## 10     0.19838710
## 11     0.06500000
## 12     0.22870968
# Average snow per month
weather_data %>% 
  summarise(avg_snow = tapply(SNOW, Month, mean, na.rm=TRUE))
##      avg_snow
## 1  0.03548387
## 2  0.09285714
## 3  0.33548387
## 4  0.00000000
## 5  0.00000000
## 6  0.00000000
## 7  0.00000000
## 8  0.00000000
## 9  0.00000000
## 10 0.00000000
## 11 0.00000000
## 12 0.08064516
# Average wind speed per month
weather_data %>%
  summarise(average_wind_speed = tapply(AWND, Month, mean, na.rm=TRUE))
##    average_wind_speed
## 1                 NaN
## 2                 NaN
## 3            5.326667
## 4            4.399667
## 5            3.932581
## 6            4.159667
## 7            3.463226
## 8            3.839032
## 9            4.302333
## 10           5.288710
## 11           5.673667
## 12           6.385806

Exploratory Data Analysis - Weather Effects

# Combining data frames to compare data
edited_weather <- select(weather_data, PRCP, SNOW, AWND, DATE)
edited_rider <- select(rider_2019_sample, age, tripduration, DATE)

total_data = merge(edited_weather, edited_rider, by.x="DATE", by.y="DATE", all.x=TRUE)
head(total_data)
##         DATE PRCP SNOW AWND age tripduration
## 1 2019-01-01 0.06    0   NA  35          123
## 2 2019-01-01 0.06    0   NA  34         2757
## 3 2019-01-01 0.06    0   NA  26          208
## 4 2019-01-01 0.06    0   NA  50         1496
## 5 2019-01-01 0.06    0   NA  31          511
## 6 2019-01-01 0.06    0   NA  46          319

Average Precipitation by Age

# Mean PRCP by Age of Rider
total_data %>% 
  group_by(age) %>%
  summarise(mean_PRCP_by_age = mean(PRCP)) %>%
  ggplot(aes(x = age, y = mean_PRCP_by_age)) + geom_line() + geom_smooth() 
## `summarise()` ungrouping output (override with `.groups` argument)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Average Wind by Age

# Mean Wind by Age of Rider
total_data %>% 
  group_by(age) %>%
  summarise(mean_AWND_by_age = mean(AWND,na.rm = TRUE)) %>%
  ggplot(aes(x = age, y = mean_AWND_by_age)) + geom_line() + geom_smooth() 
## `summarise()` ungrouping output (override with `.groups` argument)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Precipitation Effects on Trip Duration

# Average ride time when it's raining
total_data %>%
  filter(PRCP > 0) %>%
  summarise(prcp_duration_mean = mean(tripduration))
##   prcp_duration_mean
## 1           759.4454
total_data %>% 
  filter(PRCP > 0) %>%
  ggplot(aes(x = tripduration)) + 
  geom_histogram(aes(y=..density..), colour="black", fill="white") +
  geom_density(alpha=.2, fill="#FF6666") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

total_data %>%
  filter(PRCP > 0) %>%
  ggplot(aes(x = tripduration)) +
  geom_density(aes(fill=factor(PRCP)), alpha=0.8)

total_data %>%
  filter(PRCP > 0) %>%
  ggplot(aes(x = tripduration)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Snow Effects on Trip Duration

# Average ride time when it's snowing
total_data %>%
  filter(SNOW > 0) %>%
  summarise(snow_duration_mean = mean(tripduration))
##   snow_duration_mean
## 1           655.9662
total_data %>% 
  filter(SNOW > 0) %>%
  ggplot(aes(x = tripduration)) + 
  geom_histogram(aes(y=..density..), colour="black", fill="white") +
  geom_density(alpha=.2, fill="#FF6666") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

total_data %>%
  filter(SNOW > 0) %>%
  ggplot(aes(x = tripduration)) +
  geom_density(aes(fill=factor(SNOW)), alpha=0.8)

total_data %>%
  filter(SNOW > 0) %>%
  ggplot(aes(x = tripduration)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Wind Effects on Trip Duration

# Average ride time when it's windy
total_data %>%
  filter(AWND > 0) %>%
  summarise(wind_duration_mean = mean(tripduration))
##   wind_duration_mean
## 1            790.543
total_data %>% 
  filter(AWND > 0) %>%
  ggplot(aes(x = tripduration)) + 
  geom_histogram(aes(y=..density..), colour="black", fill="white") +
  geom_density(alpha=.2, fill="#FF6666") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

total_data %>%
  filter(AWND > 0) %>%
  ggplot(aes(x = tripduration)) +
  geom_density(aes(fill=factor(AWND)), alpha=0.8)

total_data %>%
  filter(AWND > 0) %>%
  ggplot(aes(x = tripduration)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Exploratory Data Analysis - Ride History

Distance Between Stations

# Distance between start and end station in Meters
rider_2019_sample <- mutate(rider_2019_sample, 
                            distance = distHaversine(cbind(rider_2019_sample$start.station.longitude,
                                                           rider_2019_sample$start.station.latitude),
                                                     cbind(rider_2019_sample$end.station.longitude,
                                                           rider_2019_sample$end.station.latitude)))

head(rider_2019_sample)
##   tripduration                starttime                 stoptime
## 1          564 2019-04-11 07:44:36.0580 2019-04-11 07:54:00.1840
## 2         1158 2019-05-15 18:00:33.3890 2019-05-15 18:19:52.0150
## 3          763 2019-03-25 13:27:50.4260 2019-03-25 13:40:33.7960
## 4          915 2019-06-21 15:52:07.8340 2019-06-21 16:07:23.6810
## 5         1368 2019-06-01 18:42:22.9500 2019-06-01 19:05:11.7510
## 6          267 2019-07-31 18:47:05.5630 2019-07-31 18:51:33.0870
##   start.station.id       start.station.name start.station.latitude
## 1             3711       E 13 St & Avenue A               40.72967
## 2             3016        Kent Ave & N 7 St               40.72037
## 3              382  University Pl & E 14 St               40.73493
## 4              359       E 47 St & Park Ave               40.75510
## 5             3295 Central Park W & W 96 St               40.79127
## 6             3377     Carroll St & Bond St               40.67861
##   start.station.longitude end.station.id  end.station.name end.station.latitude
## 1               -73.98068            168   W 18 St & 6 Ave             40.73971
## 2               -73.96165           3016 Kent Ave & N 7 St             40.72037
## 3               -73.99201            459  W 20 St & 11 Ave             40.74674
## 4               -73.97499            483   E 12 St & 3 Ave             40.73223
## 5               -73.96484           3142   1 Ave & E 62 St             40.76123
## 6               -73.99037           3398   Smith St & 9 St             40.67470
##   end.station.longitude bikeid   usertype birth.year gender age       DATE
## 1             -73.99456  29807 Subscriber       1994   Male  25 2019-04-11
## 2             -73.96165  34411 Subscriber       1974   Male  45 2019-05-15
## 3             -74.00776  16078 Subscriber       1961   Male  58 2019-03-25
## 4             -73.98890  29904 Subscriber       1964 Female  55 2019-06-21
## 5             -73.96094  30247   Customer       1969 Unkown  50 2019-06-01
## 6             -73.99786  20315 Subscriber       1971   Male  48 2019-07-31
##   Month Day Year  distance
## 1    04  11 2019 1619.3162
## 2    05  15 2019    0.0000
## 3    03  25 2019 1869.6579
## 4    06  21 2019 2803.2652
## 5    06  01 2019 3360.4376
## 6    07  31 2019  767.6646

Speed of Rider Demographics

# Speed of the rider
rider_2019_sample$speed <- rider_2019_sample$distance/rider_2019_sample$tripduration

# Average speed of all riders
rider_2019_sample %>%
  summarise(average_speed = mean(speed))
##   average_speed
## 1      2.470045
# Average speed of young riders
rider_2019_sample %>%
  filter(age <= 45) %>%
  summarise(young_average = mean(speed))
##   young_average
## 1      2.544806
# Average speed of old riders
rider_2019_sample %>%
  filter(age >= 65) %>%
  summarise(old_average = mean(speed))
##   old_average
## 1    2.193164
# Average speed of female riders
rider_2019_sample %>%
  filter(gender == "Female") %>%
  summarise(female_average = mean(speed))
##   female_average
## 1       2.331995
# Average speed of male riders
rider_2019_sample %>%
  filter(gender == "Male") %>%
  summarise(male_average = mean(speed))
##   male_average
## 1     2.577479
# Average speed of subscribers
rider_2019_sample %>%
  filter(usertype == "Customer") %>%
  summarise(customer_average = mean(speed))
##   customer_average
## 1         1.820502
# Average speed of customers
rider_2019_sample %>%
  filter(usertype == "Subscriber") %>%
  summarise(subscriber_average = mean(speed))
##   subscriber_average
## 1           2.569449
# Scatter Plot of speed by age
rider_2019_sample %>%
  ggplot(aes(x = age, y = speed)) +
  geom_point(alpha = .25, color = 'blue', size = 1) +
  geom_point(shape = 1, size = 1, colour = "black") +
  labs(title="Average Speed of Riders by Age", x="Speed", y="Age")

# Boxplot of speed by gender
rider_2019_sample %>%
  ggplot(aes(x = gender, y = speed)) +
  geom_boxplot() +
  labs(title="Speed of Riders by Gender", x="Gender", y="Speed")

# Boxplot of speed by customer type
rider_2019_sample %>%
  ggplot(aes(x = usertype, y = speed)) +
  geom_boxplot() +
  labs(title="Speed of Riders by Customer Type", x="Customer Type", y="Speed")

Start Locations

top_height <- max(rider_2019_sample$start.station.latitude) - min(rider_2019_sample$start.station.latitude)
top_width <- max(rider_2019_sample$start.station.longitude) - min(rider_2019_sample$start.station.longitude)
top_borders <- c(bottom  = min(rider_2019_sample$start.station.latitude)  - 0.1 * top_height,
                 top     = max(rider_2019_sample$start.station.latitude)  + 0.1 * top_height,
                 left    = min(rider_2019_sample$start.station.longitude) - 0.2 * top_width,
                 right   = max(rider_2019_sample$start.station.longitude) + 0.2 * top_width)

start <- get_stamenmap(top_borders, zoom = 12, maptype = "toner-lite")
## Source : http://tile.stamen.com/toner-lite/12/1205/1537.png
## Source : http://tile.stamen.com/toner-lite/12/1206/1537.png
## Source : http://tile.stamen.com/toner-lite/12/1207/1537.png
## Source : http://tile.stamen.com/toner-lite/12/1205/1538.png
## Source : http://tile.stamen.com/toner-lite/12/1206/1538.png
## Source : http://tile.stamen.com/toner-lite/12/1207/1538.png
## Source : http://tile.stamen.com/toner-lite/12/1205/1539.png
## Source : http://tile.stamen.com/toner-lite/12/1206/1539.png
## Source : http://tile.stamen.com/toner-lite/12/1207/1539.png
## Source : http://tile.stamen.com/toner-lite/12/1205/1540.png
## Source : http://tile.stamen.com/toner-lite/12/1206/1540.png
## Source : http://tile.stamen.com/toner-lite/12/1207/1540.png
## Source : http://tile.stamen.com/toner-lite/12/1205/1541.png
## Source : http://tile.stamen.com/toner-lite/12/1206/1541.png
## Source : http://tile.stamen.com/toner-lite/12/1207/1541.png
start_map <- ggmap(start, extent = "device", legend = "topright")

start_map + stat_density2d(
aes(x = start.station.longitude, y = start.station.latitude, fill = ..level.., alpha = ..level..),
size = 1, bins = 5, data = rider_2019_sample,
geom = "polygon"
)

Start Location Preferences - by Day of Week

# convert dates to weekdays
rider_2019_sample$day_of_week = weekdays(rider_2019_sample$DATE)

start_map +
stat_density2d(
aes(x = start.station.longitude, y = start.station.latitude, fill = ..level.., alpha = ..level..),
bins = 5, geom = "polygon",
data = rider_2019_sample) +
scale_fill_gradient(low = "black", high = "red") +
facet_wrap(~ day_of_week)

Start Location Preferences - by Gender

start_map +
stat_density2d(
aes(x = start.station.longitude, y = start.station.latitude, fill = ..level.., alpha = ..level..),
bins = 5, geom = "polygon",
data = rider_2019_sample) +
scale_fill_gradient(low = "black", high = "red") +
facet_wrap(~ gender)

Start Location Preferences - by Customer Type

start_map +
stat_density2d(
aes(x = start.station.longitude, y = start.station.latitude, fill = ..level.., alpha = ..level..),
bins = 5, geom = "polygon",
data = rider_2019_sample) +
scale_fill_gradient(low = "black", high = "red") +
facet_wrap(~ usertype)

Start Location Preferences - by Trip Duration

ggmap(start) +
    geom_point(data = rider_2019_sample, mapping = aes(x = start.station.longitude, y = start.station.latitude,
                                        col = tripduration)) +
    scale_color_gradient(low = "yellow", high = "red")

End Location Preferences

end <- get_stamenmap(top_borders, zoom = 12, maptype = "toner-lite")
end_map <- ggmap(end, extent = "device", legend = "topright")

end_map + stat_density2d(
aes(x = end.station.longitude, y = end.station.latitude, fill = ..level.., alpha = ..level..),
size = 1, bins = 5, data = rider_2019_sample,
geom = "polygon"
)

End Location Preferences - by Day of Week

end_map +
stat_density2d(
aes(x = end.station.longitude, y = end.station.latitude, fill = ..level.., alpha = ..level..),
bins = 5, geom = "polygon",
data = rider_2019_sample) +
scale_fill_gradient(low = "black", high = "red") +
facet_wrap(~ day_of_week)

End Location Preferences - by Gender

end_map +
stat_density2d(
aes(x = end.station.longitude, y = end.station.latitude, fill = ..level.., alpha = ..level..),
bins = 5, geom = "polygon",
data = rider_2019_sample) +
scale_fill_gradient(low = "black", high = "red") +
facet_wrap(~ gender)

End Location Preferences - by User Type

end_map +
stat_density2d(
aes(x = end.station.longitude, y = end.station.latitude, fill = ..level.., alpha = ..level..),
bins = 5, geom = "polygon",
data = rider_2019_sample) +
scale_fill_gradient(low = "black", high = "red") +
facet_wrap(~ usertype)